Much of the code and examples are copied/modified from
Blueprints for Text Analytics Using Python by Jens Albrecht, Sidharth Ramachandran, and Christian Winkler (O'Reilly, 2021), 978-1-492-07408-3.
# --- notebook-level configuration ---
# this variable controls the range of n-grams used by CountVectorizer/TfidfVectorizer
# and, therefore, the n-grams the topic modeling will use
# (it is also embedded in the file names of the pyLDAvis HTML reports saved further down)
n_gram_range = (1, 3)
# specify stop words specific to this dataset
# (these are merged into the spaCy stop-word set before vectorizing)
custom_stop_words = {'united', 'nations', 'nation'}
# specify the number of topics the NMF/LDA will create
number_of_topics = 10
cd ../..
/Users/shanekercheval/repos/nlp-template
%run "source/config/notebook_settings.py"
pd.set_option('display.max_colwidth', None)
from source.library.utilities import Timer, get_logger
from source.library.text_analysis import count_tokens, tf_idf, get_context_from_keyword, count_keywords, count_keywords_by, impurity
from source.library.sklearn_topic_modeling import *
# Read the pre-processed paragraphs DataFrame from its pickle file; the Timer
# context manager reports how long the read takes.
path = 'artifacts/data/processed/un-general-debates-paragraphs.pkl'
with Timer("Loading Data"):
    paragraphs = pd.read_pickle(path)
Started: Loading Data Finished (0.12 seconds)
This section provides a basic exploration of the text and dataset.
hlp.pandas.numeric_summary(paragraphs)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| year | 279,045 | 0 | 0.0% | 0 | 0.0% | 1,992.4 | 12.6 | 0.0 | 0.1 | -1.1 | 1,970 | 1,975.0 | 1,982.0 | 1,993.0 | 2,003.0 | 2,010.0 | 2,015 |
hlp.pandas.non_numeric_summary(paragraphs)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| country | 279,045 | 0 | 0.0% | Russian Federation | 199 | 0.1% |
| text | 279,045 | 0 | 0.0% | The President returned to the [...] | 278,820 | 99.9% |
assert not (paragraphs['text'].str.strip() == '').any()
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
# Build the stop-word set on a copy. `stopwords` is an alias for spaCy's
# module-level STOP_WORDS set, so an in-place `|=` would mutate that shared set
# for everything else that uses it during the session.
# 'll' and 've' are added on top of the dataset-specific words (presumably
# contraction fragments produced by tokenization).
stopwords = set(stopwords) | custom_stop_words | {'ll', 've'}
# Continue with a random subset of at most 2000 paragraphs (presumably to keep
# the notebook fast). The fixed seed makes the sample, and therefore the fitted
# topics, reproducible across runs; the size is capped so that a dataset with
# fewer than 2000 rows does not raise.
paragraphs = paragraphs.sample(n=min(2000, len(paragraphs)), random_state=42)
# Kept for reference: how the sample used by the unit tests was written out.
#paragraphs.to_pickle('source/tests/test_files/datasets/un_debates_paragraphs_sample.pkl')
NOTE: TF seems to be used with LDA rather than TF-IDF
# Build two document-term matrices from the same text, stop words and n-gram
# range: raw term counts and TF-IDF weights.
# The n-gram range comes from the `n_gram_range` setting at the top of the
# notebook so that the vectorizers, the timer label and the saved report names
# always agree.
with Timer(f"Calculating TF & TF-IDF ({n_gram_range[0]}-{n_gram_range[1]} ngrams)"):
    # scikit-learn documents `stop_words` as 'english', a list, or None; newer
    # releases reject a set, so pass a (deterministically ordered) list.
    stop_word_list = sorted(stopwords)

    count_vectorizer = CountVectorizer(
        stop_words=stop_word_list,
        ngram_range=n_gram_range,
        min_df=5,
        max_df=0.7,
    )
    count_vectors = count_vectorizer.fit_transform(paragraphs["text"])
    print(count_vectors.shape)

    tfidf_vectorizer = TfidfVectorizer(
        stop_words=stop_word_list,
        ngram_range=n_gram_range,
        min_df=5,
        max_df=0.7,
    )
    tfidf_vectors = tfidf_vectorizer.fit_transform(paragraphs["text"])
    # printed explicitly: a bare expression inside the `with` block shows nothing
    print(tfidf_vectors.shape)
Started: Calculating TF & TF-IDF (1-3 ngrams) (2000, 3531) Finished (0.29 seconds)
import matplotlib.pyplot as plt
def plot_top_words(model, feature_names, n_top_words, title):
    """
    Plot one horizontal bar chart per topic showing its highest-weighted terms.

    Adapted from
    https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    The grid has five columns and as many rows as the number of topics
    requires, so any number of topics can be plotted (a fixed 2 x 5 grid fails
    with more than ten topics). Panels without a topic are hidden.

    Args:
        model: fitted model exposing `components_`, iterated as one array of
            feature weights per topic.
        feature_names: sequence mapping a feature index to its token.
        n_top_words: number of highest-weighted tokens to show per topic.
        title: title placed above the whole figure.
    """
    n_columns = 5
    n_topics = len(model.components_)
    # ceiling division; at least one row so that the grid is never empty
    n_rows = max(1, -(-n_topics // n_columns))
    # 7.5 inches per row gives the 30 x 15 figure of the two-row layout
    fig, axes = plt.subplots(n_rows, n_columns, figsize=(30, 7.5 * n_rows), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # indices of the n_top_words largest weights, largest first
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        # largest weight at the top of each panel
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    # hide the trailing panels of the last row that have no topic to show
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()
def display_topics(model, features, no_top_words=5):
    """
    Print the highest-weighted features of every topic in `model`.

    For each row of `model.components_` a "Topic NN" header is printed,
    followed by up to `no_top_words` features, each with its weight expressed
    as a percentage of the topic's total weight.

    Args:
        model: fitted model exposing `components_` (one array of feature
            weights per topic).
        features: sequence mapping a feature index to its name.
        no_top_words: maximum number of features to print per topic. When it
            exceeds the number of features, all features are printed.
    """
    # a negative count prints nothing, as an empty range would
    limit = max(no_top_words, 0)
    for topic, words in enumerate(model.components_):
        total = words.sum()
        # feature indices ordered from the largest weight to the smallest;
        # slicing cannot run past the end the way positional indexing can
        largest = words.argsort()[::-1][:limit]
        print("\nTopic %02d" % topic)
        for index in largest:
            print(" %s (%2.2f)" % (features[index], abs(words[index]*100.0/total)))
from sklearn.decomposition import NMF
# Factorize the TF-IDF matrix with NMF into `number_of_topics` components.
nmf_model = NMF(init='nndsvda', n_components=number_of_topics, random_state=42, max_iter=1000)
# fit_transform is called only to fit the model; its return value is discarded
_ = nmf_model.fit_transform(tfidf_vectors)
# feature (n-gram) names, in the column order of the TF-IDF matrix
nmf_feature_names = tfidf_vectorizer.get_feature_names_out()
# plot_topics / plot_topic_sizes are not defined in this file; presumably they
# come from the star import of source.library.sklearn_topic_modeling
plot_topics(
model=nmf_model,
features=nmf_feature_names,
top_n_tokens=8,
num_tokens_in_label=2
)
plot_topic_sizes(
model=nmf_model,
dataset=tfidf_vectors,
features=nmf_feature_names,
)
# Sum the predicted topic values of each document (axis=1) and show the
# distribution of those sums as a horizontal box plot.
predicted_topics = nmf_model.transform(X=tfidf_vectors)
per_document_totals = predicted_topics.sum(axis=1)
ax = pd.Series(per_document_totals).plot(kind='box', vert=False, figsize=(10, 1))
ax.set_title("Distribution Sum of Predicted Values/Topics Per Document")
ax.set_xlabel("Sum of Predicted Values Per Document")
ax.set_yticklabels([])
# the trailing semicolon suppresses the notebook's echo of the Axes object
ax;
def get_topic_sizes_per_year(model, features, vectorizer, data=None):
    """
    Compute the size of every topic for each year, in long format.

    The text of each year is transformed with `vectorizer` and passed to
    `calculate_topic_sizes`; the resulting sizes are paired, in order, with the
    labels returned by `create_topic_labels`.

    Args:
        model: fitted topic model, passed to `create_topic_labels` and
            `calculate_topic_sizes`.
        features: feature names used to build the topic labels.
        vectorizer: fitted vectorizer used to transform each year's text.
        data: DataFrame with 'year' and 'text' columns. Defaults to the
            notebook-level `paragraphs` DataFrame.

    Returns:
        DataFrame with one row per (topic label, year) and the columns
        'topic_labels', 'year' and 'value'.
    """
    if data is None:
        data = paragraphs

    topic_labels = list(create_topic_labels(
        model=model,
        features=features,
        token_separator=' | ',
        top_n_tokens=2,
    ).values())

    # groupby visits the years in ascending order in a single pass over the data
    sizes_per_year = {
        year: calculate_topic_sizes(model=model, dataset=vectorizer.transform(texts))
        for year, texts in data.groupby('year')['text']
    }
    # NOTE(review): topics that share the same label collapse into a single
    # entry here (later value wins) - confirm labels are unique.
    yearly_dict = {
        year: dict(zip(topic_labels, sizes))
        for year, sizes in sizes_per_year.items()
    }

    # wide frame: one row per topic label, one column per year
    df = pd.DataFrame(yearly_dict).reset_index().rename(columns={'index': 'topic_labels'})
    # every column other than the id column is a year, so `value_vars` can be
    # left at its default (all non-id columns)
    return pd.melt(df, id_vars='topic_labels', var_name='year')
# Per-year topic sizes for the NMF model. The TF-IDF vectorizer is passed so
# that each year's text is transformed with the vectorizer the model was fitted on.
topic_sizes_per_year = get_topic_sizes_per_year(
model=nmf_model,
features=nmf_feature_names,
vectorizer=tfidf_vectorizer
)
topic_sizes_per_year.head()
| topic_labels | year | value | |
|---|---|---|---|
| 0 | world | peace | 1970 | 0.11 |
| 1 | assembly | session | 1970 | 0.06 |
| 2 | countries | developing | 1970 | 0.07 |
| 3 | rights | human | 1970 | 0.22 |
| 4 | nuclear | weapons | 1970 | 0.19 |
# Four views of the same long-format frame (one row per topic label and year),
# colored by topic label.
# 1) stacked area chart
fig = px.area(
topic_sizes_per_year,
x="year",
y="value",
color="topic_labels",
title="Topics Over Time",
)
fig.show()
# 2) bar chart
fig = px.bar(
topic_sizes_per_year,
x="year",
y="value",
color="topic_labels",
title="Topics Over Time",
)
fig.show()
# 3) line chart
fig = px.line(
topic_sizes_per_year,
x="year",
y="value",
color="topic_labels",
title="Topics Over Time",
)
fig.show()
# 4) scatter with a LOWESS trendline per topic; opacity=0.0 makes the markers
#    fully transparent (presumably so that only the trendlines are visible)
fig = px.scatter(
topic_sizes_per_year,
x="year",
y="value",
color="topic_labels",
trendline="lowess",
opacity=0.0,
title="Topics Over Time",
)
fig.show()
import pyLDAvis.sklearn
# Prepare the pyLDAvis visualization for the NMF model and save it as a
# standalone HTML file (the variable is named `lda_display` although the model is NMF).
# NOTE(review): later pyLDAvis releases renamed the `pyLDAvis.sklearn` module -
# confirm against the installed version.
lda_display = pyLDAvis.sklearn.prepare(nmf_model, tfidf_vectors, tfidf_vectorizer, sort_topics=False)
# pyLDAvis.display(lda_display)
# the n-gram range is embedded in the file name; the docs/models directory is assumed to exist
file_name = f"docs/models/nmf-n-grams-{n_gram_range[0]}-{n_gram_range[1]}.html"
pyLDAvis.save_html(lda_display, file_name)
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/pandas/core/internals/blocks.py:402: RuntimeWarning: divide by zero encountered in log /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/pyLDAvis/_prepare.py:246: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/pandas/core/internals/blocks.py:402: RuntimeWarning: divide by zero encountered in log /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of 
importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload
Neither the book nor the example above uses TF-IDF with LDA, and neither explains why. Both use TF-IDF with NMF and then switch to CountVectorizer for LDA.
LDA only needs a bag-of-words (term-count) vector.
from sklearn.decomposition import LatentDirichletAllocation
# Fit LDA on the raw term counts. The number of topics comes from the
# `number_of_topics` setting (instead of a hard-coded 10) so that LDA and NMF
# always use the same number of topics.
lda_model = LatentDirichletAllocation(n_components=number_of_topics, random_state=42)
# fit_transform is called only to fit the model; its return value is discarded
_ = lda_model.fit_transform(count_vectors)
# feature (n-gram) names, in the column order of the count matrix
lda_feature_names = count_vectorizer.get_feature_names_out()
plot_topics(
    model=lda_model,
    features=lda_feature_names,
    top_n_tokens=8,
    num_tokens_in_label=2,
    token_separator=' | ',
)
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
# Topic sizes for the LDA model over the count matrix it was fitted on;
# labels are built from the top 3 tokens joined by ' | '.
plot_topic_sizes(
model=lda_model,
dataset=count_vectors,
features=lda_feature_names,
top_n_tokens=3,
token_separator=' | '
)
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
# Sum the predicted topic values of each document (axis=1) for the LDA model
# and show the distribution of those sums as a horizontal box plot.
predicted_topics = lda_model.transform(X=count_vectors)
per_document_totals = predicted_topics.sum(axis=1)
ax = pd.Series(per_document_totals).plot(kind='box', vert=False, figsize=(10, 1))
ax.set_title("Distribution Sum of Predicted Values/Topics Per Document")
ax.set_xlabel("Sum of Predicted Values Per Document")
ax.set_yticklabels([])
# the trailing semicolon suppresses the notebook's echo of the Axes object
ax;
# Per-year topic sizes for the LDA model; this rebinds `topic_sizes_per_year`,
# replacing the NMF result. The count vectorizer is passed so that each year's
# text is transformed with the vectorizer the model was fitted on.
topic_sizes_per_year = get_topic_sizes_per_year(
model=lda_model,
features=lda_feature_names,
vectorizer=count_vectorizer
)
topic_sizes_per_year.head()
| topic_labels | year | value | |
|---|---|---|---|
| 0 | countries | human | 1970 | 0.10 |
| 1 | international | peace | 1970 | 0.13 |
| 2 | peace | people | 1970 | 0.07 |
| 3 | nuclear | weapons | 1970 | 0.13 |
| 4 | general | assembly | 1970 | 0.05 |
# Stacked area chart of the per-year topic sizes, colored by topic label.
fig = px.area(
topic_sizes_per_year,
x="year",
y="value",
color="topic_labels",
title="Topics Over Time",
)
fig.show()
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
# Bar chart of the per-year topic sizes, colored by topic label.
fig = px.bar(
topic_sizes_per_year,
x="year",
y="value",
color="topic_labels",
title="Topics Over Time",
)
fig.show()
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
# Line chart of the per-year topic sizes, one line per topic label.
fig = px.line(
topic_sizes_per_year,
x="year",
y="value",
color="topic_labels",
title="Topics Over Time",
)
fig.show()
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
# Scatter plot with a LOWESS trendline per topic; opacity=0.0 makes the markers
# fully transparent (presumably so that only the trendlines are visible).
fig = px.scatter(
topic_sizes_per_year,
x="year",
y="value",
color="topic_labels",
trendline="lowess",
opacity=0.0,
title="Topics Over Time",
)
fig.show()
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
import pyLDAvis.sklearn
# Prepare the pyLDAvis visualization for the LDA model and save it as a
# standalone HTML file.
# NOTE(review): later pyLDAvis releases renamed the `pyLDAvis.sklearn` module -
# confirm against the installed version.
lda_display = pyLDAvis.sklearn.prepare(lda_model, count_vectors, count_vectorizer, sort_topics=False)
# pyLDAvis.display(lda_display)
# the n-gram range is embedded in the file name; the docs/models directory is assumed to exist
file_name = f"docs/models/lda-n-grams-{n_gram_range[0]}-{n_gram_range[1]}.html"
pyLDAvis.save_html(lda_display, file_name)
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/pyLDAvis/_prepare.py:246: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.